{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "QFdG4eYf3h0L"
      },
      "source": [
        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mongodb-developer/GenAI-Showcase/blob/main/notebooks/rag/Haystack_MongoDB_Atlas_RAG.ipynb)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "rrobdhRcNb5I"
      },
      "source": [
        "# Haystack and MongoDB Atlas RAG notebook\n",
        "\n",
        "Install dependencies:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "76dK0ehtNY2L",
        "outputId": "8268021e-8ba1-48e3-e3df-c16625285d31"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Collecting haystack-ai\n",
            "  Downloading haystack_ai-2.1.2-py3-none-any.whl (319 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m319.5/319.5 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting mongodb-atlas-haystack\n",
            "  Downloading mongodb_atlas_haystack-0.3.0-py3-none-any.whl (13 kB)\n",
            "Collecting tiktoken\n",
            "  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m29.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting boilerpy3 (from haystack-ai)\n",
            "  Downloading boilerpy3-1.0.7-py3-none-any.whl (22 kB)\n",
            "Collecting haystack-bm25 (from haystack-ai)\n",
            "  Downloading haystack_bm25-1.0.2-py2.py3-none-any.whl (8.8 kB)\n",
            "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (3.1.4)\n",
            "Collecting lazy-imports (from haystack-ai)\n",
            "  Downloading lazy_imports-0.3.1-py3-none-any.whl (12 kB)\n",
            "Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (10.1.0)\n",
            "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (3.3)\n",
            "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (1.25.2)\n",
            "Collecting openai>=1.1.0 (from haystack-ai)\n",
            "  Downloading openai-1.30.5-py3-none-any.whl (320 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m320.7/320.7 kB\u001b[0m \u001b[31m15.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (2.0.3)\n",
            "Collecting posthog (from haystack-ai)\n",
            "  Downloading posthog-3.5.0-py2.py3-none-any.whl (41 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.3/41.3 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: python-dateutil in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (2.8.2)\n",
            "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (6.0.1)\n",
            "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (2.31.0)\n",
            "Requirement already satisfied: tenacity in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (8.3.0)\n",
            "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (4.66.4)\n",
            "Requirement already satisfied: typing-extensions>=4.7 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (4.11.0)\n",
            "Collecting pymongo[srv] (from mongodb-atlas-haystack)\n",
            "  Downloading pymongo-4.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (670 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m670.0/670.0 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken) (2024.5.15)\n",
            "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (3.7.1)\n",
            "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai>=1.1.0->haystack-ai) (1.7.0)\n",
            "Collecting httpx<1,>=0.23.0 (from openai>=1.1.0->haystack-ai)\n",
            "  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: pydantic<3,>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (2.7.1)\n",
            "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (1.3.1)\n",
            "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->haystack-ai) (3.3.2)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->haystack-ai) (3.7)\n",
            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->haystack-ai) (2.0.7)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->haystack-ai) (2024.2.2)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->haystack-ai) (2.1.5)\n",
            "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai) (2023.4)\n",
            "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai) (2024.1)\n",
            "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil->haystack-ai) (1.16.0)\n",
            "Collecting monotonic>=1.5 (from posthog->haystack-ai)\n",
            "  Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n",
            "Collecting backoff>=1.10.0 (from posthog->haystack-ai)\n",
            "  Downloading backoff-2.2.1-py3-none-any.whl (15 kB)\n",
            "Collecting dnspython<3.0.0,>=1.16.0 (from pymongo[srv]->mongodb-atlas-haystack)\n",
            "  Downloading dnspython-2.6.1-py3-none-any.whl (307 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m307.7/307.7 kB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai>=1.1.0->haystack-ai) (1.2.1)\n",
            "Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai)\n",
            "  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai)\n",
            "  Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai>=1.1.0->haystack-ai) (0.7.0)\n",
            "Requirement already satisfied: pydantic-core==2.18.2 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai>=1.1.0->haystack-ai) (2.18.2)\n",
            "Installing collected packages: monotonic, lazy-imports, haystack-bm25, h11, dnspython, boilerpy3, backoff, tiktoken, pymongo, posthog, httpcore, httpx, openai, haystack-ai, mongodb-atlas-haystack\n",
            "Successfully installed backoff-2.2.1 boilerpy3-1.0.7 dnspython-2.6.1 h11-0.14.0 haystack-ai-2.1.2 haystack-bm25-1.0.2 httpcore-1.0.5 httpx-0.27.0 lazy-imports-0.3.1 mongodb-atlas-haystack-0.3.0 monotonic-1.6 openai-1.30.5 posthog-3.5.0 pymongo-4.7.2 tiktoken-0.7.0\n"
          ]
        }
      ],
      "source": [
        "pip install haystack-ai mongodb-atlas-haystack tiktoken"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "aeg_wcIiPYnY"
      },
      "source": [
        "\n",
        "## Setup MongoDB Atlas connection and Open AI\n",
        "\n",
        "\n",
        "* Set the MongoDB connection string. Follow the steps [here](https://www.mongodb.com/docs/manual/reference/connection-string/) to get the connection string from the Atlas UI.\n",
        "\n",
        "* Set the OpenAI API key. Steps to obtain an API key as [here](https://help.openai.com/en/articles/4936850-where-do-i-find-my-openai-api-key)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "id": "MZokdDxIPb9p"
      },
      "outputs": [],
      "source": [
        "import getpass\n",
        "import os"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "57gYJTBVPfBX",
        "outputId": "9a3841f2-7b7d-4250-bed2-6b139415b5d5"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Enter your MongoDB connection string:··········\n"
          ]
        }
      ],
      "source": [
        "os.environ[\"MONGO_CONNECTION_STRING\"] = getpass.getpass(\n",
        "    \"Enter your MongoDB connection string:\"\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "J8Gd-SMuRSH-",
        "outputId": "bd1135cd-c508-4e4a-d57b-a0d15a82d126"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Enter your Open AI Key:··········\n"
          ]
        }
      ],
      "source": [
        "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter your Open AI Key:\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "HnG43-yPRIl3"
      },
      "source": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Fv1pPHqXQFa-"
      },
      "source": [
        "## Create vector search index on collection\n",
        "\n",
        "Follow this [tutorial](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-index/) to create a vector index on database: `haystack_test` collection `test_collection`.\n",
        "\n",
        "Verify that the index name is `vector_index` and the syntax specify:\n",
        "```\n",
        "{\n",
        "  \"fields\": [\n",
        "    {\n",
        "      \"type\": \"vector\",\n",
        "      \"path\": \"embedding\",\n",
        "      \"numDimensions\": 1536,\n",
        "      \"similarity\": \"cosine\"\n",
        "    }\n",
        "  ]\n",
        "}\n",
        "```"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "cOMyplbvOMDk"
      },
      "source": [
        "### Setup vector store to load documents:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {
        "id": "-y9waymAOOgs"
      },
      "outputs": [],
      "source": [
        "from haystack import Document, Pipeline\n",
        "from haystack.components.builders.prompt_builder import PromptBuilder\n",
        "from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder\n",
        "from haystack.components.generators import OpenAIGenerator\n",
        "from haystack.components.writers import DocumentWriter\n",
        "from haystack.document_stores.types import DuplicatePolicy\n",
        "from haystack_integrations.components.retrievers.mongodb_atlas import (\n",
        "    MongoDBAtlasEmbeddingRetriever,\n",
        ")\n",
        "from haystack_integrations.document_stores.mongodb_atlas import (\n",
        "    MongoDBAtlasDocumentStore,\n",
        ")\n",
        "\n",
        "# Create some example documents\n",
        "documents = [\n",
        "    Document(content=\"My name is Jean and I live in Paris.\"),\n",
        "    Document(content=\"My name is Mark and I live in Berlin.\"),\n",
        "    Document(content=\"My name is Giorgio and I live in Rome.\"),\n",
        "]\n",
        "\n",
        "document_store = MongoDBAtlasDocumentStore(\n",
        "    database_name=\"haystack_test\",\n",
        "    collection_name=\"test_collection\",\n",
        "    vector_search_index=\"vector_index\",\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "3MMitwR3P0uj"
      },
      "source": [
        "Build the writer pipeline to load documnets"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "dYEo2ZkMQptv",
        "outputId": "e6a86d38-7891-4cc8-e8ed-e15e39fc86e8"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  4.16it/s]\n"
          ]
        },
        {
          "data": {
            "text/plain": [
              "{'doc_embedder': {'meta': {'model': 'text-embedding-ada-002',\n",
              "   'usage': {'prompt_tokens': 32, 'total_tokens': 32}}},\n",
              " 'doc_writer': {'documents_written': 0}}"
            ]
          },
          "execution_count": 6,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Setting up a document writer to handle the insertion of documents into the MongoDB collection.\n",
        "doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)\n",
        "\n",
        "# Initializing a document embedder to convert text content into vectorized form.\n",
        "doc_embedder = OpenAIDocumentEmbedder()\n",
        "\n",
        "# Creating a pipeline for indexing documents. The pipeline includes embedding and writing documents.\n",
        "indexing_pipe = Pipeline()\n",
        "indexing_pipe.add_component(instance=doc_embedder, name=\"doc_embedder\")\n",
        "indexing_pipe.add_component(instance=doc_writer, name=\"doc_writer\")\n",
        "\n",
        "# Connecting the components of the pipeline for document flow.\n",
        "indexing_pipe.connect(\"doc_embedder.documents\", \"doc_writer.documents\")\n",
        "\n",
        "# Running the pipeline with the list of documents to index them in MongoDB.\n",
        "indexing_pipe.run({\"doc_embedder\": {\"documents\": documents}})"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "TA_ELb_URY0D"
      },
      "source": [
        "## Build a RAG Pipeline\n",
        "\n",
        "Lets create a pipeline that will Retrieve Augment and Generate a response for user questions"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "RaNFayobRkU9",
        "outputId": "0152319b-631d-4d45-ec03-7a8c1a2561ea"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "<haystack.core.pipeline.pipeline.Pipeline object at 0x7fc98d95bdf0>\n",
              "🚅 Components\n",
              "  - text_embedder: OpenAITextEmbedder\n",
              "  - retriever: MongoDBAtlasEmbeddingRetriever\n",
              "  - prompt_builder: PromptBuilder\n",
              "  - llm: OpenAIGenerator\n",
              "🛤️ Connections\n",
              "  - text_embedder.embedding -> retriever.query_embedding (List[float])\n",
              "  - retriever.documents -> prompt_builder.documents (List[Document])\n",
              "  - prompt_builder.prompt -> llm.prompt (str)"
            ]
          },
          "execution_count": 9,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Template for generating prompts for a movie recommendation engine.\n",
        "prompt_template = \"\"\"\n",
        "    You are an assistant allowed to use the following context documents.\\nDocuments:\n",
        "    {% for doc in documents %}\n",
        "        {{ doc.content }}\n",
        "    {% endfor %}\n",
        "\n",
        "    \\Query: {{query}}\n",
        "    \\nAnswer:\n",
        "\"\"\"\n",
        "\n",
        "# Setting up a retrieval-augmented generation (RAG) pipeline for generating responses.\n",
        "rag_pipeline = Pipeline()\n",
        "rag_pipeline.add_component(\"text_embedder\", OpenAITextEmbedder())\n",
        "\n",
        "# Adding a component for retrieving related documents from MongoDB based on the query embedding.\n",
        "rag_pipeline.add_component(\n",
        "    instance=MongoDBAtlasEmbeddingRetriever(document_store=document_store, top_k=15),\n",
        "    name=\"retriever\",\n",
        ")\n",
        "\n",
        "# Building prompts based on retrieved documents to be used for generating responses.\n",
        "rag_pipeline.add_component(\n",
        "    instance=PromptBuilder(template=prompt_template), name=\"prompt_builder\"\n",
        ")\n",
        "\n",
        "# Adding a language model generator to produce the final text output.\n",
        "rag_pipeline.add_component(instance=OpenAIGenerator(), name=\"llm\")\n",
        "\n",
        "# Connecting the components of the RAG pipeline to ensure proper data flow.\n",
        "rag_pipeline.connect(\"text_embedder.embedding\", \"retriever.query_embedding\")\n",
        "rag_pipeline.connect(\"retriever\", \"prompt_builder.documents\")\n",
        "rag_pipeline.connect(\"prompt_builder\", \"llm\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "n7mEU3ydRz8k"
      },
      "source": [
        "Lets test the pipeline"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "GHV5rRhaR3SX",
        "outputId": "dfb23085-ef80-4c99-a1c2-dd3f107ef313"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Mark lives in Berlin.\n"
          ]
        }
      ],
      "source": [
        "query = \"Where does mark live?\"\n",
        "result = rag_pipeline.run(\n",
        "    {\n",
        "        \"text_embedder\": {\"text\": query},\n",
        "        \"prompt_builder\": {\"query\": query},\n",
        "    }\n",
        ")\n",
        "print(result[\"llm\"][\"replies\"][0])"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    },
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "state": {}
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
